Setup and Import Data

import pandas as pd
import numpy as np
import seaborn as sns
np.random.seed(44)  # fix the global NumPy seed so exploration cells are reproducible

# Colab-only: mount Google Drive to read the project CSVs
from google.colab import drive
drive.mount("/content/gdrive")

# Load the labelled training set and the unlabelled test set from Drive
train = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/test.csv')
print(train.shape)
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
(63999, 37)

Data Exploration

train.shape
(63999, 37)
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63999 entries, 0 to 63998
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer_id                  63999 non-null  object 
 1   Target                       63999 non-null  int64  
 2   Balance                      63999 non-null  float64
 3   PreviousCampaignResult       63999 non-null  int64  
 4   Product1                     63999 non-null  int64  
 5   Product2                     63999 non-null  int64  
 6   Product3                     63999 non-null  int64  
 7   Product4                     63999 non-null  int64  
 8   Product5                     63999 non-null  int64  
 9   Product6                     63999 non-null  int64  
 10  Transaction1                 63999 non-null  float64
 11  Transaction2                 63999 non-null  float64
 12  Transaction3                 63999 non-null  float64
 13  Transaction4                 63999 non-null  float64
 14  Transaction5                 63999 non-null  float64
 15  Transaction6                 63999 non-null  float64
 16  Transaction7                 63999 non-null  float64
 17  Transaction8                 63999 non-null  float64
 18  Transaction9                 63999 non-null  float64
 19  ExternalAccount1             63999 non-null  int64  
 20  ExternalAccount2             63999 non-null  int64  
 21  ExternalAccount3             63999 non-null  int64  
 22  ExternalAccount4             63999 non-null  int64  
 23  ExternalAccount5             63999 non-null  int64  
 24  ExternalAccount6             63999 non-null  int64  
 25  ExternalAccount7             63999 non-null  int64  
 26  ActivityIndicator            63999 non-null  int64  
 27  RegularInteractionIndicator  63999 non-null  int64  
 28  CompetitiveRate1             63999 non-null  float64
 29  CompetitiveRate2             63999 non-null  float64
 30  CompetitiveRate3             63999 non-null  float64
 31  CompetitiveRate4             63999 non-null  float64
 32  CompetitiveRate5             63999 non-null  float64
 33  CompetitiveRate6             63999 non-null  float64
 34  CompetitiveRate7             63999 non-null  float64
 35  RateBefore                   63999 non-null  float64
 36  ReferenceRate                63999 non-null  float64
dtypes: float64(19), int64(17), object(1)
memory usage: 18.1+ MB
#count the unique values in "Target", which is the prediction we are going to make
train["Target"].value_counts()
0    32014
1    31985
Name: Target, dtype: int64
#Check if the data is balanced
%matplotlib inline
import matplotlib.pyplot as plt

def check_data_balance(series, style="seaborn-pastel"):
  """Display the class counts of `series` and draw a pie chart of their shares.

  Used to eyeball whether the target labels are balanced.
  """
  counts = series.value_counts()
  with plt.style.context(style):
    display(counts)  # tabular view of the per-class counts
    plt.pie(
        counts,
        explode=[0.05] * len(counts),  # pull each wedge out slightly
        labels=counts.index,
        autopct='%1.1f%%',  # annotate each wedge with its percentage share
    );
check_data_balance(train["Target"])
0    32014
1    31985
Name: Target, dtype: int64
#inspect the data using the dataframe's describe() function
train.describe()
Target Balance PreviousCampaignResult Product1 Product2 Product3 Product4 Product5 Product6 Transaction1 Transaction2 Transaction3 Transaction4 Transaction5 Transaction6 Transaction7 Transaction8 Transaction9 ExternalAccount1 ExternalAccount2 ExternalAccount3 ExternalAccount4 ExternalAccount5 ExternalAccount6 ExternalAccount7 ActivityIndicator RegularInteractionIndicator CompetitiveRate1 CompetitiveRate2 CompetitiveRate3 CompetitiveRate4 CompetitiveRate5 CompetitiveRate6 CompetitiveRate7 RateBefore ReferenceRate
count 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 6.399900e+04 6.399900e+04 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000 63999.000000
mean 0.499773 22697.014218 0.001484 0.200441 0.160456 0.001766 0.092158 0.022125 0.041876 312.852441 98.306560 972.439108 13346.706243 5.872490e+03 8.831981e+03 101.191833 1522.355254 1123.682674 0.197034 0.301911 0.082533 0.123783 0.191862 0.100423 0.019313 19.776778 0.595291 1.379736 1.836435 1.391770 1.757322 1.882755 1.678434 1.412233 1.678434 1.315226
std 0.500004 10819.489043 0.039303 0.400333 0.367031 0.041983 0.289251 0.147092 0.200307 4932.061568 897.296635 5903.126316 25046.228309 2.226038e+04 3.051680e+04 894.873343 7410.544303 9570.708952 0.397762 0.459090 0.275176 0.329337 0.393768 0.300566 0.137623 104.827436 1.162643 0.206340 0.370122 0.161833 0.357483 0.516006 0.160600 0.575083 0.160600 0.106634
min 0.000000 10000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -34.340000 -33813.000000 0.000000e+00 -2.092436e+04 -957.010000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.025000 1.425000 1.125000 1.425000 1.175000 1.425000 0.525000 1.425000 1.125000
25% 0.000000 13499.005000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.425000 1.575000 1.425000 1.575000 1.575000 1.675000 0.525000 1.675000 1.375000
50% 0.000000 19982.570000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5670.000000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 0.000000 1.425000 1.925000 1.425000 1.675000 2.375000 1.675000 1.575000 1.675000 1.375000
75% 1.000000 30104.035000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 17500.000000 2.000000e+03 5.500000e+03 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 12.000000 1.000000 1.575000 2.375000 1.575000 1.675000 2.375000 1.875000 1.575000 1.875000 1.375000
max 1.000000 50000.000000 2.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 505000.000000 85500.000000 633506.040000 917005.000000 1.330000e+06 2.029492e+06 36186.980000 297055.190000 513627.920000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 6762.000000 56.000000 1.575000 2.375000 1.575000 2.375000 2.375000 1.875000 2.125000 1.875000 1.375000
%matplotlib inline
import matplotlib.pyplot as plt

# Draw a 50-bin histogram for every numeric feature in one figure grid
train.hist(bins=50, figsize=(20,15))

#save_fig("attribute_histogram_plots")
plt.show()
import plotly.express as px


# Scatter-matrix the features in groups of ~8 columns (same slices as before)
# so each figure stays legible; one interactive figure per group.
for start, stop in [(1, 8), (8, 16), (16, 24), (24, 32), (32, 37)]:
    fig = px.scatter_matrix(train.iloc[:, start:stop], height=1000)
    fig.show()

len(train.columns)
37
# Check PreviousCampaignResult feature values - almost all of them are zero

train['PreviousCampaignResult'].value_counts()
0    63906
1       91
2        2
Name: PreviousCampaignResult, dtype: int64
# Check values for Product features

# 0/1 ownership flags for the six products
products = train[['Product1', 'Product2', 'Product3', 'Product4', 'Product5', 'Product6']]
#products

# Per product: how many customers hold it (1) vs don't (0)
products.apply(pd.Series.value_counts)
Product1 Product2 Product3 Product4 Product5 Product6
0 51171 53730 63886 58101 62583 61319
1 12828 10269 113 5898 1416 2680
# Products value counts in % contribution

# Share of 0/1 per product flag, formatted as percentage strings.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
products.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
Product1 Product2 Product3 Product4 Product5 Product6
0 80.0% 84.0% 99.8% 90.8% 97.8% 95.8%
1 20.0% 16.0% 0.2% 9.2% 2.2% 4.2%
# Box plots for Transaction features: a 3x3 grid, one panel per column.

for position, column in enumerate((f"Transaction{i}" for i in range(1, 10)), start=1):
    plt.subplot(3, 3, position)  # subplot(3, 3, k) is equivalent to subplot(33k)
    sns.boxplot(train[column])

fig = plt.gcf()
fig.set_size_inches(10,10)
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

# Count non-zero values in each Transaction column

transaction_cols = ['Transaction%d' % i for i in range(1, 10)]
transactions = train[transaction_cols]
# transactions

# np.count_nonzero applied column-wise gives non-zero counts per feature
transactions.apply(np.count_nonzero)
Transaction1     1856
Transaction2     3651
Transaction3     4895
Transaction4    46538
Transaction5    19515
Transaction6    25255
Transaction7     3033
Transaction8     4672
Transaction9     6221
dtype: int64
# Check values for External Account features

external_accounts = train[['ExternalAccount1', 'ExternalAccount2', 'ExternalAccount3', 'ExternalAccount4', 'ExternalAccount5', 'ExternalAccount6', 'ExternalAccount7']]
# external_accounts

# Share of 0/1 per external-account flag, formatted as percentage strings.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
external_accounts.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
ExternalAccount1 ExternalAccount2 ExternalAccount3 ExternalAccount4 ExternalAccount5 ExternalAccount6 ExternalAccount7
0 80.3% 69.8% 91.7% 87.6% 80.8% 90.0% 98.1%
1 19.7% 30.2% 8.3% 12.4% 19.2% 10.0% 1.9%
# Check values for Activity Indicator feature

sns.boxplot(train['ActivityIndicator'])
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

<matplotlib.axes._subplots.AxesSubplot at 0x7eff8660bfd0>
# Group values into buckets by percentile

# Bucket edges: exact zero, then the 50th/75th/99th percentiles of activity.
p50 = np.percentile(train["ActivityIndicator"],50)
p75 = np.percentile(train["ActivityIndicator"],75)
p99 = np.percentile(train["ActivityIndicator"],99)
# -1 as the left edge so the first bucket (-1, 0] captures exactly the zeros
bins = [-1, 0, p50, p75, p99, np.inf]
# Share of customers per bucket, as percentage strings
train["ActivityIndicator"].value_counts(bins=bins, sort=False, normalize=True).mul(100).round(1).astype(str) + '%'
(-1.001, 0.0]    31.5%
(0.0, 4.0]       24.5%
(4.0, 12.0]      21.2%
(12.0, 316.0]    21.7%
(316.0, inf]      1.0%
Name: ActivityIndicator, dtype: object
# Check target variable for customers with no activity

inactive = train[train['ActivityIndicator'] == 0]
inactive["Target"].value_counts()
0    12636
1     7547
Name: Target, dtype: int64
inactive["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
0    62.6%
1    37.4%
Name: Target, dtype: object
# Check target variable for customers with some activity

active = train[train['ActivityIndicator']  != 0]
active["Target"].value_counts()
1    24438
0    19378
Name: Target, dtype: int64
active["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
1    55.8%
0    44.2%
Name: Target, dtype: object
# Check values for Regular Interaction Indicator feature

sns.countplot(train["RegularInteractionIndicator"])
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning:

Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

<matplotlib.axes._subplots.AxesSubplot at 0x7eff84c4fd10>
# Group values into buckets by percentile

p50 = np.percentile(train["RegularInteractionIndicator"],50)
p75 = np.percentile(train["RegularInteractionIndicator"],75)
p99 = np.percentile(train["RegularInteractionIndicator"],99)
# NOTE(review): unlike the ActivityIndicator cell, 0 is not an explicit edge here;
# that works because p50 == 0 in this data, so (-1, 0] is still the zero bucket —
# confirm if the data changes.
bins = [-1, p50, p75, p99, np.inf]
train["RegularInteractionIndicator"].value_counts(bins=bins, sort=False, normalize=True).mul(100).round(1).astype(str) + '%'
(-1.001, 0.0]    63.1%
(0.0, 1.0]       25.3%
(1.0, 5.0]       10.8%
(5.0, inf]        0.8%
Name: RegularInteractionIndicator, dtype: object
# Check target variable for customers with zero interaction frequency score

infrequent = train[train['RegularInteractionIndicator'] == 0]
infrequent["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
0    50.7%
1    49.3%
Name: Target, dtype: object
# Summarize rate offers

# Fix: the original column list repeated 'CompetitiveRate4' and omitted
# 'CompetitiveRate3', so rate 3 was never summarized and rate 4 appeared twice.
rates = train[['CompetitiveRate1', 'CompetitiveRate2', 'CompetitiveRate3', 'CompetitiveRate4', 'CompetitiveRate5', 'CompetitiveRate6', 'CompetitiveRate7', 
               'RateBefore', 'ReferenceRate']]
# Share of each distinct rate value per column, as percentage strings.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
rates.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
CompetitiveRate1 CompetitiveRate2 CompetitiveRate4 CompetitiveRate4 CompetitiveRate5 CompetitiveRate6 CompetitiveRate7 RateBefore ReferenceRate
0.525 NaN NaN NaN NaN NaN NaN 25.7% NaN NaN
1.025 23.9% NaN NaN NaN NaN NaN NaN NaN NaN
1.125 NaN NaN NaN NaN NaN NaN NaN NaN 23.9%
1.175 NaN NaN NaN NaN 23.9% NaN NaN NaN NaN
1.375 NaN NaN NaN NaN NaN NaN NaN NaN 76.1%
1.425 26.7% 23.8% 23.8% 23.8% NaN 23.9% NaN 23.9% NaN
1.475 23.8% NaN NaN NaN NaN NaN 23.9% NaN NaN
1.575 25.7% 25.7% 25.7% 25.7% 25.7% NaN 26.7% NaN NaN
1.675 NaN NaN 26.7% 26.7% NaN 26.7% NaN 26.7% NaN
1.725 NaN NaN NaN NaN NaN 23.8% NaN 23.8% NaN
1.875 NaN NaN NaN NaN NaN 25.7% NaN 25.7% NaN
1.925 NaN 23.9% NaN NaN NaN NaN NaN NaN NaN
2.125 NaN NaN NaN NaN NaN NaN 23.8% NaN NaN
2.375 NaN 26.7% 23.9% 23.9% 50.4% NaN NaN NaN NaN
# Check if there's correlation between balance and activity/interaction indicators

indicator_cols = ['Balance', 'ActivityIndicator', 'RegularInteractionIndicator']
train_corr = train[indicator_cols].copy()
# Pearson correlation matrix rendered as an annotated heat map
corr_matrix = train_corr.corr()
sns.heatmap(corr_matrix, annot=True, linewidth=.5, center=0,
            fmt='.1g', cbar=False, cmap='GnBu')
<matplotlib.axes._subplots.AxesSubplot at 0x7eff9a05f950>
 

Observations:

  • Training dataset contains 63,999 instances and 37 features
  • Dataset doesn't have null values
  • All features are numerical (except for Customer ID)
  • Target variable is balanced, i.e. equally divided between positive and negative label
  • Balance variable is not normally distributed, it takes values between 10,000 and 50,000 with median of around 20,000
  • Almost all values of Previous Campaign Result variable are zero. Since most values are the same, it could potentially be excluded from the model
  • Product 3 is only used by 0.2% of customers
  • 31.5% of customers did not perform any activity with the bank, and 62.6% of them did not respond to previous campaigns (i.e. target = 0)
  • 63.1% of customers had zero interaction frequency score, however target variable for these customers was distributed evenly
  • Majority of customers had the rate of 1.375% after negotiations
  • There seems to be no correlation between account balance and number/frequency of customer activity

Prepare Train Data

  • Remove outliers
  • Add dummy variables
  • Split Train data further into train and validation sets
# Features: everything except the label, keyed by customer id
X = train.drop(['Target'], axis=1)
X = X.set_index('Customer_id')
# Label as a one-column frame (flattened to 1-D later with ravel())
y = train[['Target']]

Remove outliers

Remove outliers on Balance

Use the 1.5×IQR rule to identify outliers: anything above Q3 + 1.5×IQR is a high outlier, and anything below Q1 − 1.5×IQR is a low outlier.

High outliers and low outliers will be replaced by the median of the balance feature.

# Tukey fences on Balance: values outside Q1 - 1.5*IQR / Q3 + 1.5*IQR are
# replaced by the column median. (The bare quantile display line and the
# unused q2/q4 from the original were removed.)
q1 = X['Balance'].quantile(0.25)
q3 = X['Balance'].quantile(0.75)
IQR = q3 - q1 
low_outliers = q1 - 1.5* IQR
high_outliers = q3 + 1.5*IQR
# Hoist the median out of the per-row lambda: the original recomputed
# X[['Balance']].median() for every single row. One vectorized mask() replaces
# both apply() passes; the replacement value is the median of the original
# column, which the high-outlier pass leaves effectively unchanged anyway.
balance_median = X['Balance'].median()
is_outlier = (X['Balance'] > high_outliers) | (X['Balance'] < low_outliers)
X['Balance'] = X['Balance'].mask(is_outlier, balance_median)
from google.colab import files

# Box plot of the cleaned Balance column; save the figure to disk
X[['Balance']].boxplot().get_figure().savefig('outlier_balance.png')

# Colab-only: push the saved image to the local browser download
files.download('outlier_balance.png')

Isolate the transaction feature per customer

# Work on just the nine Transaction columns for DBSCAN-based outlier detection
X_transactions = X.loc[:,'Transaction1':'Transaction9']

Use DBSCAN to remove outliers for each of transaction/feature of transaction

from sklearn.cluster import DBSCAN
# NOTE(review): this rebinds the class name DBSCAN to the fitted estimator, so
# the class cannot be instantiated again in this session. Later cells read
# DBSCAN.labels_ from this object, so the shadowing is left in place here —
# prefer a distinct variable name (e.g. dbscan_model) in a refactor.
DBSCAN = DBSCAN(eps=2000, min_samples=4, n_jobs=-1).fit(X_transactions)

Below, I'm checking how many instances fall into each cluster. The transaction values are spread so widely that DBSCAN produces many small clusters. I'll keep the rows assigned to non-negative cluster labels, and treat those labelled -1 (noise) as outliers.

import sys
# pd.set_option("display.max_rows", None, "display.max_columns", None)
np.set_printoptions(threshold=sys.maxsize)  # print full arrays instead of truncating
labels=DBSCAN.labels_  # cluster id per row; -1 marks DBSCAN noise points (outliers)
labels_df =  pd.DataFrame(labels, columns=['cluster'])
labels_df['cluster'].value_counts().sort_index()  
-1       9605
 0      53334
 1          7
 2         11
 3          4
        ...  
 139        6
 140        3
 141        3
 142        4
 143        4
Name: cluster, Length: 145, dtype: int64

These are the outliers or the ones with clusters of value -1, and they will be considered as outliers

X_transactions.iloc[labels_df[ labels_df['cluster']==-1].index, :]
Transaction1 Transaction2 Transaction3 Transaction4 Transaction5 Transaction6 Transaction7 Transaction8 Transaction9
Customer_id
CTSP000013 0.0 0.0 6861.24 17823.08 22263.88 18016.13 0.00 11555.00 39604.67
CTSP000018 0.0 0.0 0.00 5000.00 0.00 60623.56 0.00 0.00 10391.00
CTSP000022 0.0 2360.0 1000.00 16940.00 6331.53 25.00 5441.78 0.00 122.96
CTSP000031 0.0 0.0 0.00 0.00 0.00 438181.08 0.00 0.00 0.00
CTSP000032 0.0 632.0 13315.25 33861.77 19215.45 32281.25 3644.25 27008.81 632.00
... ... ... ... ... ... ... ... ... ...
CTSP063966 36583.0 2860.0 4424.00 26972.69 15245.79 54940.06 0.00 6500.00 0.00
CTSP063969 0.0 1580.0 4921.52 48948.69 17921.52 154890.04 0.00 39448.69 0.00
CTSP063970 0.0 20.0 0.00 2941.94 0.00 5320.00 1676.03 0.00 3009.29
CTSP063975 0.0 0.0 0.00 190000.00 143000.00 0.00 0.00 0.00 0.00
CTSP063989 0.0 0.0 16886.44 145472.55 119628.04 446400.00 0.00 0.00 112677.32

9605 rows × 9 columns

Collect the index of rows flagged as outliers (cluster = -1) and separate them from the rows that are not outliers (cluster ≠ -1)

# Customer ids of the rows DBSCAN flagged as noise (cluster == -1)
outliers_index = X_transactions.iloc[labels_df[ labels_df['cluster']==-1].index, :].index
# Fix: the original used `~labels_df['cluster']==-1`, which applies bitwise NOT
# to the values BEFORE comparing (~ binds tighter than ==) and therefore selected
# only cluster 0, silently excluding clusters 1..n from the "good" rows. The
# intent — every row DBSCAN did not flag as noise — is `!= -1`.
good_index = X_transactions.iloc[labels_df[ labels_df['cluster'] != -1].index, :].index

Non-zero values of clusters with value -1 above will be replaced by the median of each transaction feature

 good_transactions_median = X.loc[good_index.tolist(), 'Transaction1':'Transaction9'].median()

Replace the outliers with the median (without outliers) for each feature

# 25th percentile of each Transaction column over the outlier rows; values at or
# above that threshold are replaced by the clean-row median of the same column.
outliers = X.loc[outliers_index.tolist(), 'Transaction1':'Transaction9'].quantile([0.25])

outlier_rows = outliers_index.tolist()
for col in ['Transaction%d' % i for i in range(1, 10)]:
    threshold = outliers[col][0.25]
    replacement = good_transactions_median[col]
    X.loc[outlier_rows, col] = X.loc[outlier_rows, col].apply(
        lambda x: replacement if x >= threshold else x)

Add dummy variables

Using the CombinedAttributesAdder()

from sklearn.base import BaseEstimator, TransformerMixin



class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append engineered features to the input frame and return an ndarray.

    New features cover per-customer aggregates of the nine Transaction and
    seven CompetitiveRate columns, product/external-account counts, and
    interaction terms with the rate change and balance.

    Parameters
    ----------
    remove_cols : list of str or None, default None
        Columns to drop after the engineered features are appended. When falsy
        (None or []), nothing is dropped and the final column names are
        published in the module-level ``cols`` so the Random Forest
        feature-importance chart can label its bars.
    """

    def __init__(self, remove_cols=None): # no *args or **kargs
        # Fix: the original assigned self.remove_cols only when the argument was
        # not None, so transform() raised AttributeError with the default value.
        self.remove_cols = remove_cols

    def fit(self, X, y=None):
        return self  # stateless transformer: nothing to learn

    def _engineered_features(self, X):
        """Return the list of new feature Series, in final column order."""
        transactions = X.loc[:,'Transaction1':'Transaction9']
        competitive_rates = X.loc[:,'CompetitiveRate1':'CompetitiveRate7']

        # % change from the customer's previous rate to the reference rate
        diff_interest_rate = X[['RateBefore','ReferenceRate']].pct_change(axis=1)['ReferenceRate'].rename('diff_interest_rate')

        # number of products / external accounts held per customer
        count_product = X.loc[:,'Product1':'Product6'].sum(axis=1)
        count_external_acct = X.loc[:,'ExternalAccount1':'ExternalAccount7'].sum(axis=1)

        # how many of the nine transaction amounts are strictly positive
        count_transactions = transactions.gt(0).sum(axis=1)

        # per-customer aggregates over the transaction amounts
        # (axis=1 replaces the original's repeated .T.<agg>() transposes)
        ave_transactions = transactions.mean(axis=1)
        max_transactions = transactions.max(axis=1)
        min_transactions = transactions.min(axis=1)
        median_transactions = transactions.median(axis=1)
        std_transactions = transactions.std(axis=1)
        sum_transactions = transactions.sum(axis=1)
        q1_transactions = transactions.quantile(.25, axis=1)
        q2_transactions = transactions.quantile(.5, axis=1)
        q3_transactions = transactions.quantile(.75, axis=1)
        IQR_transactions = q3_transactions - q1_transactions
        # per-customer Tukey fences, kept as spread features
        low_outliers_transactions = q1_transactions - 1.5 * IQR_transactions
        high_outliers_transactions = q3_transactions + 1.5 * IQR_transactions

        # per-customer aggregates over the competitive rate offers
        ave_rate = competitive_rates.mean(axis=1)
        min_rate = competitive_rates.min(axis=1)
        median_rate = competitive_rates.median(axis=1)
        std_rate = competitive_rates.std(axis=1)

        # interaction features
        new_count_product = count_product * X['PreviousCampaignResult']
        # potential interest savings in $ the customer gained from the new rate
        interest_savings = (X['RateBefore'] - X['ReferenceRate']) * sum_transactions
        prod_to_ext_acct = count_product.divide(count_external_acct, fill_value=0).fillna(0).replace(np.inf, 0)
        newreference_prodcount = (X['ReferenceRate'] * count_product).fillna(0)
        newreference_sumtransactions = X['ReferenceRate'] * sum_transactions
        regact_diffrate = X['ActivityIndicator'] * diff_interest_rate
        balance_diffrate = X['Balance'] * diff_interest_rate

        # NOTE(review): the original also computed new_act_indicator,
        # new_reg_act_indicator (which crashed on the misspelled column name
        # 'RegularInteractionIndicator  ' — trailing spaces), max_rate and
        # ave_transactions_diffrate, but never concatenated any of them; that
        # dead code is removed here.

        # Order matters: it fixes the column order of transform()'s output.
        return [
            diff_interest_rate.rename('test_diff_interest_rate'),
            count_external_acct.rename('count_external_acct'),
            count_transactions.rename('count_transactions'),
            ave_transactions.rename('ave_transactions'),
            max_transactions.rename('max_transactions'),
            min_transactions.rename('min_transactions'),
            median_transactions.rename('median_transactions'),
            std_transactions.rename('std_transactions'),
            ave_rate.rename('ave_rate'),
            min_rate.rename('min_rate'),
            median_rate.rename('median_rate'),
            std_rate.rename('std_rate'),
            new_count_product.rename('new_count_product'),
            interest_savings.rename('interest_savings'),
            prod_to_ext_acct.rename('prod_to_ext_acct'),
            newreference_prodcount.rename('newreference_prodcount'),
            newreference_sumtransactions.rename('newreference_sumtransactions'),
            regact_diffrate.rename('regact_diffrate'),
            balance_diffrate.rename('balance_diffrate'),
            q1_transactions.rename('q1_transactions'),
            q2_transactions.rename('q2_transactions'),
            q3_transactions.rename('q3_transactions'),
            IQR_transactions.rename('IQR_transactions'),
            low_outliers_transactions.rename('low_outliers_transactions'),
            high_outliers_transactions.rename('high_outliers_transactions'),
        ]

    def transform(self, X):
        """Append the engineered features and return the frame as an ndarray.

        The two previously duplicated concat bodies (remove vs. keep branch)
        are collapsed into one shared concat.
        """
        X = pd.concat([X] + self._engineered_features(X), axis=1)

        if self.remove_cols: # optional based on user input / hyperparameter
            X = X.drop(columns=self.remove_cols)
            print("Shape after: ", X.shape)
        else:
            global cols #create global variable so you can see the column names on RF's feature importance chart
            cols=X.columns

        return X.values

Use pipeline to transform (add new features and normalize) the dataset

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Feature engineering followed by min-max scaling into [0, 1].
# remove_cols=[] keeps every engineered column (see CombinedAttributesAdder).
num_pipeline = Pipeline([
        ('attribs_adder', CombinedAttributesAdder(remove_cols=[])),
        ('minmax_scaler', MinMaxScaler()),
    ])

# X becomes a plain ndarray after the pipeline; y flattened to a 1-D label vector
X = num_pipeline.fit_transform(X)
y = y.values.ravel()

Split train and validation

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

Train Model B: Random Forest

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score


# Refer to
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
# to identify correct scoring. there are variations of f1 score

## NOTE: this GRID search is expensive: 14 * 4 * 4 * 3 * 2 = 1344 candidate
## parameter combinations, each cross-validated over 5 folds.
param_grid = [{'n_estimators': np.arange(180, 250, 5).tolist(),
               'max_depth': np.arange(2, 15, 4).tolist(),
               'min_samples_leaf': [1, 2, 5, 10],
               'max_features': [5, 10, 20],
               'bootstrap': [True, False]}]

# random_state fixed for reproducibility (the fitted model echoed below was
# trained with random_state=42); n_jobs=-1 uses all cores for both the
# forest and the grid search.
RF = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_clf = GridSearchCV(RF, param_grid, cv=5, scoring='f1', n_jobs=-1)

rf_clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=230,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

Serialize (save) the model for future use

from joblib import dump, load
import pickle
from google.colab import files

import datetime
# Timestamp the artifact (to the hour) so successive runs don't overwrite it.
currentDT = datetime.datetime.now()
date = currentDT.strftime("%Y-%m-%d %H")


# Use joblib (not pickle) so the on-disk format matches the .joblib extension;
# joblib is also scikit-learn's recommended persistence mechanism.
dump(rf_clf, f'rf_clf_{date}.joblib')
files.download(f'rf_clf_{date}.joblib')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

# Score the tuned random forest on both splits.
yhat_train = rf_clf.predict(X_train)
yhat_valid = rf_clf.predict(X_valid)

# ravel() flattens the 2x2 confusion matrix into (tn, fp, fn, tp); the
# per-split names stay in the global namespace for later cells.
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, yhat_train).ravel()
tn_valid, fp_valid, fn_valid, tp_valid = confusion_matrix(y_valid, yhat_valid).ravel()

# Print an identical report for train then validation, blank line between.
for counts, y_true, y_pred in (
        ((tn_train, fp_train, fn_train, tp_train), y_train, yhat_train),
        ((tn_valid, fp_valid, fn_valid, tp_valid), y_valid, yhat_valid)):
    tn, fp, fn, tp = counts
    print("True Negative: ", tn, " | False positive: ", fp,
          " | False negative: ", fn, " | True positive: ", tp)
    print("Recall score: ", recall_score(y_true, y_pred))
    print("Precision score: ", precision_score(y_true, y_pred))
    if y_pred is yhat_train:
        print("")
True Negative:  15894  | False positive:  9686  | False negative:  9263  | True positive:  16356
Recall score:  0.6384324134431477
Precision score:  0.6280623608017817

True Negative:  3910  | False positive:  2524  | False negative:  2344  | True positive:  4022
Recall score:  0.6317939051209551
Precision score:  0.6144210204705164
# F1 folds the precision/recall pair above into a single comparable score.
print("Train f1 Score: ", f1_score(y_train, yhat_train))
print("Validation f1 Score: ", f1_score(y_valid, yhat_valid))
Train f1 Score:  0.6332049321538492
Validation f1 Score:  0.6229863692688972

Model B.2: Random Forest with KFold

Use KFold as a variation

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# shuffle=True is required for random_state to have any effect; without it
# sklearn emits a FutureWarning and, from 0.24 on, raises an error.
skfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

for train_index, valid_index in skfolds.split(X, y):
    # clone() returns an unfitted copy of rf_clf; since rf_clf is a
    # GridSearchCV wrapper, each fold re-runs the (expensive) grid search.
    clone_clf = clone(rf_clf)

    X_train_folds = X[train_index]
    y_train_folds = y[train_index]

    X_valid_fold = X[valid_index]
    y_valid_fold = y[valid_index]

    clone_clf.fit(X_train_folds, y_train_folds)

    yhat_train = clone_clf.predict(X_train_folds)
    print('Train Accuracy: ', accuracy_score(y_train_folds, yhat_train))
    print("Train f1 Score: ", f1_score(y_train_folds, yhat_train))

    yhat_valid = clone_clf.predict(X_valid_fold)

    print('Validation Accuracy: ', accuracy_score(y_valid_fold, yhat_valid))
    print("Validation f1 Score: ", f1_score(y_valid_fold, yhat_valid))

    print("")
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_split.py:296: FutureWarning:

Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.

Train Accuracy:  0.629179774578637
Train f1 Score:  0.6302121205825524
Validation Accuracy:  0.62625
Validation f1 Score:  0.6298130493995295

Train Accuracy:  0.6287005979291235
Train f1 Score:  0.6350194552529183
Validation Accuracy:  0.623
Validation f1 Score:  0.6303468562323814

Train Accuracy:  0.6309298110377299
Train f1 Score:  0.6365482858373852
Validation Accuracy:  0.617875
Validation f1 Score:  0.6227788746298124

Train Accuracy:  0.631375
Train f1 Score:  0.6367331855136733
Validation Accuracy:  0.6134133383336459
Validation f1 Score:  0.6171937859751191

See each feature's importance relative to the rest using RF's feature importances

Create dictionary to change feature indices to feature names

col_dict = dict(zip(range(0, len(cols)), cols))

Plot each feature's relative importance

import matplotlib.pyplot as plt

# rf_clf is a GridSearchCV wrapper; feature_importances_ and estimators_
# live on the underlying fitted forest, so access them via best_estimator_
# (GridSearchCV itself has no feature_importances_ attribute).
best_rf = rf_clf.best_estimator_

importances = best_rf.feature_importances_
# Spread of each feature's importance across the individual trees,
# used as error bars on the plot.
std = np.std([tree.feature_importances_ for tree in best_rf.estimators_],
             axis=0)
indices = np.argsort(importances)  # ascending, so the top feature plots last

# Plot the feature importances of the forest
plt.figure(figsize=(80, 80))
plt.rcParams.update({'font.size': 40})
plt.title("Feature importances")
plt.barh(range(X.shape[1]), importances[indices],
         color="r", xerr=std[indices], align="center")
# If you want to define your own labels,
# change indices to a list of labels on the following line.
plt.yticks(range(X.shape[1]), [col_dict[k] for k in indices])
plt.ylim([-1, X.shape[1]])
plt.show()

Train Model C: KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Tune only the neighbourhood size; all other KNN settings stay at defaults.
KNN = KNeighborsClassifier()
param_grid = [{'n_neighbors': [180, 200, 220, 250]}]

# 5-fold CV, selecting on binary F1 to match the other models.
grid_search_KNN = GridSearchCV(KNN, param_grid, cv=5, scoring='f1')
grid_search_KNN.fit(X_train, y_train)
GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'n_neighbors': [180, 200, 220, 250]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)
grid_search_KNN.best_params_
{'n_neighbors': 180}
# Keep only the refitted best estimator from the search.
knn_clf = grid_search_KNN.best_estimator_
from joblib import dump, load
import pickle
# Use joblib (not pickle) so the on-disk format matches the .joblib
# extension; joblib is scikit-learn's recommended persistence mechanism.
dump(knn_clf, "knn_clf_no_all_outliers.joblib")
from google.colab import files
files.download('knn_clf_no_all_outliers.joblib')
# Score the tuned KNN on both splits.
yhat_train = knn_clf.predict(X_train)
yhat_valid = knn_clf.predict(X_valid)

# ravel() flattens the 2x2 confusion matrix into (tn, fp, fn, tp); the
# per-split names stay in the global namespace for later cells.
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, yhat_train).ravel()
tn_valid, fp_valid, fn_valid, tp_valid = confusion_matrix(y_valid, yhat_valid).ravel()

# Print an identical report for train then validation, blank line between.
for counts, y_true, y_pred in (
        ((tn_train, fp_train, fn_train, tp_train), y_train, yhat_train),
        ((tn_valid, fp_valid, fn_valid, tp_valid), y_valid, yhat_valid)):
    tn, fp, fn, tp = counts
    print("True Negative: ", tn, " | False positive: ", fp,
          " | False negative: ", fn, " | True positive: ", tp)
    print("Recall score: ", recall_score(y_true, y_pred))
    print("Precision score: ", precision_score(y_true, y_pred))
    if y_pred is yhat_train:
        print("")
True Negative:  18220  | False positive:  7360  | False negative:  13474  | True positive:  12145
Recall score:  0.47406221944650456
Precision score:  0.6226608561907203

True Negative:  4559  | False positive:  1875  | False negative:  3336  | True positive:  3030
Recall score:  0.4759660697455231
Precision score:  0.617737003058104

Model E: Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

# Sweep the learning rate with the tree settings held fixed; watching the
# train/validation accuracy gap shows where overfitting kicks in.
for learning_rate in (0.025, 0.05, 0.1, 0.25, 0.35):
    gb = GradientBoostingClassifier(n_estimators=200,
                                    learning_rate=learning_rate,
                                    max_features=5,
                                    max_depth=8,
                                    random_state=0)
    gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_valid, y_valid)))
Learning rate:  0.025
Accuracy score (training): 0.678
Accuracy score (validation): 0.624
Learning rate:  0.05
Accuracy score (training): 0.698
Accuracy score (validation): 0.623
Learning rate:  0.1
Accuracy score (training): 0.728
Accuracy score (validation): 0.621
Learning rate:  0.25
Accuracy score (training): 0.796
Accuracy score (validation): 0.608
Learning rate:  0.35
Accuracy score (training): 0.833
Accuracy score (validation): 0.601
from sklearn.metrics import mean_squared_error
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier

# staged_predict yields hard predictions after each boosting stage; with 0/1
# labels the MSE of those predictions equals the misclassification rate, so
# argmin picks the stage count with the fewest validation errors
# (an early-stopping-style choice of n_estimators).
# NOTE(review): `gb` is the last model from the sweep above
# (learning_rate=0.35) — confirm that is the intended reference model.
errors = [mean_squared_error(y_valid, y_pred)
          for y_pred in gb.staged_predict(X_valid)]
bst_n_estimators = np.argmin(errors) + 1

gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=bst_n_estimators, random_state=42)  # shallow trees with the tuned stage count
gb_clf.fit(X_train, y_train)
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=12,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Predictions from the early-stopped gradient-boosting model.
yhat_train = gb_clf.predict(X_train)
yhat_valid = gb_clf.predict(X_valid)

# Displayed inline by the notebook; the matrix itself is not stored.
confusion_matrix(y_train, yhat_train)

precision = precision_score(y_train, yhat_train)
recall = recall_score(y_train, yhat_train)
print("Train Precision: ", precision)
print("Train Recall: ", recall)
print("Train f1 Score: ", f1_score(y_train, yhat_train))
Train Precision:  0.6045731597359975
Train Recall:  0.6543190600726023
Train f1 Score:  0.6284632399805047
gb_clf.get_params
<bound method BaseEstimator.get_params of GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=12,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)>
# Same metrics on the held-out validation split.
yhat_valid_precision = precision_score(y_valid, yhat_valid)
yhat_valid_recall = recall_score(y_valid, yhat_valid)
yhat_valid_f1 = f1_score(y_valid, yhat_valid)

print("Validation Precision: ", yhat_valid_precision)
print("Validation Recall: ", yhat_valid_recall)
print("Validation f1 Score: ", yhat_valid_f1)
Validation Precision:  0.5991652274035694
Validation Recall:  0.653942821237826
Validation f1 Score:  0.6253567673126033

Train Model D: Voting Classifier

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from joblib import dump, load

# probability=True is required for soft voting (it averages predict_proba).
svm_clf = SVC(gamma="scale", random_state=42, probability=True)

# Soft-voting ensemble over the four tuned models.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('svc', svm_clf),
        ('knn', knn_clf),
        ('gb', gb_clf),
    ],
    voting='soft',
)

voting_clf.fit(X_train, y_train)
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight='balanced',
                                                     criterion='gini',
                                                     max_depth=6,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=230,
                                                     n_jobs=None,
                                                     oob_s...
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimators=12,
                                                         n_iter_no_change=None,
                                                         presort='deprecated',
                                                         random_state=42,
                                                         subsample=1.0,
                                                         tol=0.0001,
                                                         validation_fraction=0.1,
                                                         verbose=0,
                                                         warm_start=False))],
                 flatten_transform=True, n_jobs=None, voting='soft',
                 weights=None)
from sklearn.metrics import f1_score

# Refit each model and compare validation F1 side by side.
# NOTE(review): rf_clf is a GridSearchCV, so its fit re-runs the whole search.
for model in (rf_clf, svm_clf, knn_clf, voting_clf):
    print(model)  # full repr of the estimator being evaluated
    model.fit(X_train, y_train)
    # yhat_valid deliberately stays global: the next cell reads it.
    yhat_valid = model.predict(X_valid)
    print(model.__class__.__name__, f1_score(y_valid, yhat_valid))
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=230,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
RandomForestClassifier 0.6229863692688972
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=42, shrinking=True, tol=0.001,
    verbose=False)
SVC 0.5343594386600271
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=180, p=2,
                     weights='uniform')
KNeighborsClassifier 0.5376630290125101
VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=False,
                                                     ccp_alpha=0.0,
                                                     class_weight='balanced',
                                                     criterion='gini',
                                                     max_depth=6,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=230,
                                                     n_jobs=None,
                                                     oob_s...
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimators=12,
                                                         n_iter_no_change=None,
                                                         presort='deprecated',
                                                         random_state=42,
                                                         subsample=1.0,
                                                         tol=0.0001,
                                                         validation_fraction=0.1,
                                                         verbose=0,
                                                         warm_start=False))],
                 flatten_transform=True, n_jobs=None, voting='soft',
                 weights=None)
VotingClassifier 0.602590209334528
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# yhat_valid still holds the voting classifier's predictions — it was the
# last model evaluated in the loop above — so the commented line is redundant.
# yhat_valid = voting_clf.predict(X_valid)
yhat_valid_precision = precision_score(y_valid, yhat_valid)
yhat_valid_recall = recall_score(y_valid, yhat_valid)
print("Validation Precision: ", yhat_valid_precision)
print("Validation Recall: ", yhat_valid_recall)
print("Validation f1 Score: ", f1_score(y_valid, yhat_valid))
Validation Precision:  0.6257824395195398
Validation Recall:  0.5810556079170593
Validation f1 Score:  0.602590209334528

Select the final model and prepare the test dataset for prediction

test = test.set_index('Customer_id')
# Use transform, NOT fit_transform: the pipeline (in particular MinMaxScaler)
# must reuse the min/max learned from the training data. Refitting on the
# test set scales it on a different basis than the model was trained with
# (data leakage / distribution skew in the submitted predictions).
prepared_test = num_pipeline.transform(test)
final_predictions = rf_clf.predict(prepared_test)

Run the selected final model on Test dataset (for submission on Kaggle)

# Assemble the Kaggle submission: one row per customer with its prediction.
submission = pd.DataFrame({'Customer_id': test.index, 'Target': final_predictions})
submission.to_csv('submission.csv', index=False)
from google.colab import files
files.download('submission.csv')